/*
* Sun Public License Notice
*
* The contents of this file are subject to the Sun Public License
* Version 1.0 (the "License"). You may not use this file except in
* compliance with the License. A copy of the License is available at
* http://www.sun.com/
*
* The Original Code is Forte for Java, Community Edition. The Initial
* Developer of the Original Code is Sun Microsystems, Inc. Portions
* Copyright 1997-2000 Sun Microsystems, Inc. All Rights Reserved.
*/
package org.netbeans.editor;
/**
* Lexical analyzer that works on a given text buffer. It allows
* a given character buffer to be parsed sequentially by calling
* <tt>nextToken()</tt>, which returns the token-ids. Token-ids are
* integer constants greater than or equal to zero. They are usually
* sequential, but they do not have to be. The highest token-id
* must be assigned to the <tt>highestTokenID</tt> variable
* in the constructor of the given analyzer. The token-id numbers can be
* translated to meaningful names by calling <tt>getTokenName()</tt>.
*
* After the token is found by calling the <tt>nextToken</tt> method,
* the <tt>getTokenOffset()</tt> method can be used
* to get the starting offset of the current
* token in the buffer. The <tt>getTokenLength()</tt> gives the length
* of the current token.
*
* The heart of the analyzer is the <tt>parseToken()</tt> method, which
* parses the text and returns the token-id of the last token found.
* The <tt>parseToken()</tt> method is called from <tt>nextToken()</tt>.
* It operates with two important variables. The <tt>offset</tt>
* variable identifies the currently scanned character in the buffer.
* The <tt>tokenOffset</tt> is the beginning of the current token.
* The <tt>state</tt> variable, which identifies the current internal
* state of the analyzer, is set accordingly as the characters are parsed.
* If <tt>parseToken()</tt> recognizes a token, it returns its ID;
* <tt>tokenOffset</tt> is then its beginning in the buffer and
* <tt>offset - tokenOffset</tt> is its length. When the token is processed,
* the value of <tt>tokenOffset</tt> is set to the current
* value of <tt>offset</tt> and the parsing continues.
*
* Internal states are integer constants used internally by the analyzer.
* They are assigned to the <tt>state</tt> variable to express
* that the analyzer has moved from one state to another.
* They are usually numbered starting from zero, but they do not
* have to be. The only reserved value is -1, which is reserved
* for the INIT state - the initial internal state of the analyzer.
*
* There is also support for defining persistent info about
* the current state of the analyzer. This info can later be used
* to restore the parsing from some particular state instead of
* parsing from the beginning of the buffer. This feature is very
* useful when modifications are performed in the document.
* The info is stored through the <tt>StateInfo</tt> interface
* with <tt>BaseStateInfo</tt> as the basic implementation.
* It allows getting and setting the two values that are important
* from the persistence point of view.
* The first one is the value of the <tt>state</tt> variable.
* The other one is the difference <tt>offset - tokenOffset</tt>,
* which is called pre-scan. A particular analyzer can define
* additional values important for the persistent storage.
* The <tt>createStateInfo()</tt> method can be overridden to create
* a custom state-info, and <tt>loadState()</tt> and <tt>storeState()</tt>
* can be overridden to get/set the additional values.
*
* The <tt>load()</tt> method sets the buffer to be parsed.
*
*
* @author Miloslav Metelka
* @version 1.00
*/
public class Syntax {

    /** Result of <tt>compareState()</tt>: the analyzer state equals the given state info. */
    public static final int EQUAL_STATE = 0;

    /** Result of <tt>compareState()</tt>: the analyzer state differs from the given state info. */
    public static final int DIFFERENT_STATE = 1;

    /** Initial internal state of the analyzer */
    public static final int INIT = -1;

    /** Special token ID signaling invalid token. */
    public static final int INVALID = -3;

    /** Special token ID signaling that the end of the text buffer was reached. */
    public static final int EOT = -2;

    /** Special token ID signaling that the end of line was found. */
    public static final int EOL = -1;

    /** Token name describing invalid token ID */
    public static final String TN_INVALID = "INVALID"; // NOI18N

    /** Token name describing EOL */
    public static final String TN_EOL = "EOL"; // NOI18N

    /** Token name describing EOT */
    public static final String TN_EOT = "EOT"; // NOI18N

    // Some of the most common token names follow.

    /** Token name describing plain text */
    public static final String TN_TEXT = "text"; // NOI18N

    /** Token name describing erroneous text */
    public static final String TN_ERROR = "error"; // NOI18N

    /** Token name describing a keyword */
    public static final String TN_KEYWORD = "keyword"; // NOI18N

    /** Token name describing an identifier */
    public static final String TN_IDENTIFIER = "identifier"; // NOI18N

    /** Token name describing a function call */
    public static final String TN_FUNCTION = "function"; // NOI18N

    /** Token name describing an operator */
    public static final String TN_OPERATOR = "operator"; // NOI18N

    /** Token name describing line comment */
    public static final String TN_LINE_COMMENT = "line-comment"; // NOI18N

    /** Token name describing block comment */
    public static final String TN_BLOCK_COMMENT = "block-comment"; // NOI18N

    /** Token name describing character constant */
    public static final String TN_CHAR = "char"; // NOI18N

    /** Token name describing string constant */
    public static final String TN_STRING = "string"; // NOI18N

    /** Token name describing integer constant */
    public static final String TN_INT = "int"; // NOI18N

    /** Token name describing hexadecimal constant */
    public static final String TN_HEX = "hex"; // NOI18N

    /** Token name describing octal constant */
    public static final String TN_OCTAL = "octal"; // NOI18N

    /** Token name describing long constant */
    public static final String TN_LONG = "long"; // NOI18N

    /** Token name describing float constant */
    public static final String TN_FLOAT = "float"; // NOI18N

    /** Token name describing double constant */
    public static final String TN_DOUBLE = "double"; // NOI18N

    /** Internal state of the lexical analyzer. At the beginning
     * it's set to INIT value but it is changed by <tt>parseToken()</tt>
     * as the characters are processed one by one.
     */
    protected int state = INIT;

    /** Text buffer to scan */
    protected char buffer[];

    /** Current offset in the buffer */
    protected int offset;

    /** Offset holding the beginning of the current token */
    protected int tokenOffset;

    /** Holds the additional information about the token parsed.
     * It can hold the concrete type of the keyword or operator
     * for example. This base class never assigns it; it is only
     * exposed through <tt>getHelperID()</tt>, so filling it is left
     * to the particular analyzer.
     */
    protected int helperID;

    /** This field is reserved for the future use. */
    protected boolean lightError;

    /** This variable is the length of the token that was found */
    protected int tokenLength;

    /** Setting this flag to true means that there will be no more
     * buffers available so that analyzer should return all the tokens
     * including those whose successful scanning would be otherwise
     * left for later when the next buffer will be available.
     */
    protected boolean lastBuffer;

    /** On which offset in the buffer scanning should stop. */
    protected int stopOffset;

    /** The variable identifying the highest token ID used
     * by the syntax or -1 if the syntax defines no tokens.
     * This variable is used by <tt>getHighestTokenID()</tt>
     * and should be assigned in the constructor.
     */
    protected int highestTokenID = -1; // no tokens defined here

    /** Function that should be called externally to scan the text.
     * It manages the call to <tt>parseToken()</tt> and cares about the proper
     * setting of the offsets.
     * It can be extended to support any custom debugging required.
     *
     * @return EOT when <tt>offset</tt> has reached <tt>stopOffset</tt>;
     *   otherwise the ID returned by <tt>parseToken()</tt>. Empty tokens
     *   (zero length) are skipped by asking for the next token again.
     */
    public int nextToken() {
        // Return immediately when at the end of buffer
        if (offset >= stopOffset) {
            tokenLength = 0;
            return EOT;
        }

        int tokenID = parseToken();
        if (tokenID >= EOL) { // EOL or regular token found
            tokenLength = offset - tokenOffset;
            tokenOffset = offset; // the next token starts here
            if (tokenLength == 0) { // test for empty token
                return nextToken(); // repeat until non-empty token is found
            }
        } else { // ID below EOL returned (EOT or INVALID) - no token length
            tokenLength = 0;
        }
        return tokenID;
    }

    /** This is core function of analyzer and it returns one of following numbers:
     * a) token number of next token from scanned text
     * b) EOL when end of line was found in scanned buffer
     * c) EOT when there is no more chars available in scanned buffer.
     *
     * The function scans the active character and does one or more
     * of the following actions:
     * 1. change internal analyzer state (state = new-state)
     * 2. return token ID (return token-ID)
     * 3. adjust current position to signal different end of token;
     *    the character that offset points to is not included in the token
     *
     * This base implementation recognizes nothing and always returns EOT.
     */
    protected int parseToken() {
        return EOT;
    }

    /** Prepare the analyzer for scanning the given area of the buffer.
     * Both <tt>offset</tt> and <tt>tokenOffset</tt> are set to the given
     * offset, the length of the previously found token is forgotten
     * and then either the given state info is loaded
     * through <tt>loadState()</tt> or, when it is null, the initial state
     * is loaded through <tt>loadInitState()</tt>.
     * @param stateInfo state info to be loaded into the analyzer. It can be null
     *   in which case the initial state is used.
     * @param buffer buffer that will be scanned
     * @param offset offset of the first character that will be scanned
     * @param len length of the area to be scanned
     * @param lastBuffer whether this is the last buffer in the document. All the tokens
     *   will be returned including the last possibly incomplete one.
     */
    public void load(StateInfo stateInfo, char buffer[], int offset, int len, boolean lastBuffer) {
        this.buffer = buffer;
        this.offset = offset;
        this.tokenOffset = offset;
        this.stopOffset = offset + len;
        this.lastBuffer = lastBuffer;
        // No token has been found in the newly loaded area yet. Without this
        // getTokenLength() and getTokenOffset() would keep reporting values
        // derived from the token found before the load.
        this.tokenLength = 0;
        if (stateInfo != null) {
            loadState(stateInfo);
        } else {
            loadInitState();
        }
    }

    /** Relocate scanning to another buffer.
     * This is used to continue scanning after previously
     * reported EOT. Relocation delta between current offset and the requested offset
     * is computed and all the offsets are relocated. If there's a non-zero preScan
     * in the analyzer, it is a caller's responsibility to provide all the preScan
     * characters in the relocation buffer.
     * @param buffer next buffer where the scan will continue.
     * @param offset offset where the scan will continue.
     *   It's not decremented by the current preScan.
     * @param len length of the area to be scanned.
     *   It's not extended by the current preScan.
     * @param lastBuffer whether this is the last buffer in the document. All the tokens
     *   will be returned including the last possibly incomplete one.
     */
    public void relocate(char buffer[], int offset, int len, boolean lastBuffer) {
        this.buffer = buffer;
        this.lastBuffer = lastBuffer;
        int delta = offset - this.offset; // delta according to current offset
        this.offset += delta;
        this.tokenOffset += delta; // keeps the preScan unchanged
        this.stopOffset = offset + len;
    }

    /** Set if this buffer is the last one. */
    public void setLastBuffer(boolean lastBuffer) {
        this.lastBuffer = lastBuffer;
    }

    /** Set the offset in buffer where scanning should stop.
     * It forces the analyzer to stop explicitly at some
     * offset in the buffer. It's used for example when the document
     * is read initially.
     */
    public void setStopOffset(int stopOffset) {
        this.stopOffset = stopOffset;
    }

    /** Get the current buffer */
    public final char[] getBuffer() {
        return buffer;
    }

    /** Get the current scanning offset */
    public final int getOffset() {
        return offset;
    }

    /** Get start of token in scanned buffer. It is computed
     * as the current offset decreased by the length of the token found.
     */
    public final int getTokenOffset() {
        return offset - tokenLength;
    }

    /** Get length of token in scanned buffer. */
    public final int getTokenLength() {
        return tokenLength;
    }

    /** Return the token ID in respect to specific syntax class.
     * This method becomes handy when the syntax is composed
     * from several other syntaxes.
     * @param tokenID token ID to translate
     * @param syntaxClass syntax class the token ID should be related to.
     *   It can be null.
     * @return the unchanged token ID when the syntax class is null or it is
     *   exactly the class of this analyzer, INVALID otherwise.
     */
    public int translateTokenID(int tokenID, Class syntaxClass) {
        if (syntaxClass == null || this.getClass() == syntaxClass) {
            return tokenID;
        } else {
            return INVALID;
        }
    }

    /** Get the highest token ID. This method can be redefined although
     * usually it's enough to assign the <tt>highestTokenID</tt> variable
     * in the syntax constructor.
     */
    public int getHighestTokenID() {
        return highestTokenID;
    }

    /** Returns the token helper ID that if filled by the analyzer
     * holds the additional information about the token parsed.
     */
    public final int getHelperID() {
        return helperID;
    }

    /** Get the pre-scan which is a number
     * of characters between offset and tokenOffset.
     * If there are no more characters in the current buffer,
     * the analyzer returns EOT, but it can be in a state when
     * there are already some characters parsed at the end of
     * the current buffer but the token
     * is still incomplete and it cannot be returned yet.
     * The pre-scan value helps to determine how many characters
     * from the end of the current buffer should be present
     * at the beginning of the next buffer so that the current
     * incomplete token can be returned as the first token
     * when parsing the next buffer.
     */
    public int getPreScan() {
        return offset - tokenOffset;
    }

    /** Initialize the analyzer when scanning from the beginning
     * of the document or when the state stored in syntax mark
     * is null for some reason or to explicitly reset the analyzer
     * to the initial state. The offsets must not be touched by this method.
     */
    public void loadInitState() {
        state = INIT;
    }

    /** Reset the analyzer. The token length and all the offsets
     * (<tt>stopOffset</tt>, <tt>tokenOffset</tt> and <tt>offset</tt>)
     * are set to zero and the initial state is loaded
     * through <tt>loadInitState()</tt>. The buffer and the
     * <tt>lastBuffer</tt> flag are left as they are.
     */
    public void reset() {
        tokenLength = stopOffset = tokenOffset = offset = 0;
        loadInitState();
    }

    /** Load valid state info into the analyzer. Offsets
     * are already initialized when this method is called. This method
     * must get the state from the state info and set it to the analyzer. Then
     * it must decrease tokenOffset by the preScan stored in the state info.
     * @param stateInfo state info to be loaded into syntax. It must be non-null value.
     */
    public void loadState(StateInfo stateInfo) {
        state = stateInfo.getState();
        tokenOffset -= stateInfo.getPreScan();
    }

    /** Store state of this analyzer into given state info.
     * Both the internal state and the current pre-scan are stored.
     */
    public void storeState(StateInfo stateInfo) {
        stateInfo.setState(state);
        stateInfo.setPreScan(getPreScan());
    }

    /** Compare state of this analyzer to given state info.
     * @param stateInfo state info to compare with. It can be null.
     * @return EQUAL_STATE when both the internal state and the pre-scan match,
     *   DIFFERENT_STATE otherwise or when the state info is null.
     */
    public int compareState(StateInfo stateInfo) {
        if (stateInfo != null) {
            return ((stateInfo.getState() == state) && stateInfo.getPreScan() == getPreScan())
                   ? EQUAL_STATE : DIFFERENT_STATE;
        } else {
            return DIFFERENT_STATE;
        }
    }

    /** Create state info appropriate for particular analyzer */
    public StateInfo createStateInfo() {
        return new BaseStateInfo();
    }

    /** Get the name of the token by knowing the tokenID. This method
     * is used for finding the proper coloring and for the debugging purposes too.
     * This base implementation knows the special token IDs only.
     */
    public String getTokenName(int tokenID) {
        // test special token IDs
        switch (tokenID) {
            case EOL:
                return TN_EOL;
            case EOT:
                return TN_EOT;
            case INVALID:
                return TN_INVALID;
            default: // token ID not recognized
                return "Unknown token ID " + tokenID; // NOI18N
        }
    }

    /** Get state name as string. It can be used for debugging purposes
     * by developer of new syntax analyzer. The states that this function
     * recognizes can include all constants used in analyzer so that it can
     * be used everywhere in analyzer to convert numbers to more practical strings.
     * This base implementation knows the INIT state only.
     */
    public String getStateName(int stateNumber) {
        switch (stateNumber) {
            case INIT:
                return "INIT"; // NOI18N
            default:
                return "Unknown state " + stateNumber; // NOI18N
        }
    }

    /** Syntax information as String */
    public String toString() {
        return "tokenOffset=" + tokenOffset // NOI18N
               + ", offset=" + offset // NOI18N
               + ", state=" + getStateName(state) // NOI18N
               + ", stopOffset=" + stopOffset // NOI18N
               + ", lastBuffer=" + lastBuffer; // NOI18N
    }

    /** Interface that stores two basic pieces of information about
     * the state of the whole lexical analyzer - its internal state and preScan.
     */
    public interface StateInfo {

        /** Get the internal state */
        public int getState();

        /** Store the internal state */
        public void setState(int state);

        /** Get the preScan value */
        public int getPreScan();

        /** Store the preScan value */
        public void setPreScan(int preScan);

    }

    /** Base implementation of the StateInfo interface */
    public static class BaseStateInfo implements StateInfo {

        /** analyzer state */
        private int state;

        /** Pre-scan length */
        private int preScan;

        /** Get the stored internal state */
        public int getState() {
            return state;
        }

        /** Store the internal state */
        public void setState(int state) {
            this.state = state;
        }

        /** Get the stored preScan value */
        public int getPreScan() {
            return preScan;
        }

        /** Store the preScan value */
        public void setPreScan(int preScan) {
            this.preScan = preScan;
        }

        /** Get the description of this state info. The given syntax
         * is used to translate the stored state number to its name.
         */
        public String toString(Syntax syntax) {
            return "state=" + syntax.getStateName(getState()) + ", preScan=" + getPreScan(); // NOI18N
        }

    }

}
/*
* Log
* 30 Gandalf 1.29 1/13/00 Miloslav Metelka
* 29 Gandalf 1.28 1/7/00 Miloslav Metelka
* 28 Gandalf 1.27 1/6/00 Miloslav Metelka
* 27 Gandalf 1.26 1/4/00 Miloslav Metelka
* 26 Gandalf 1.25 12/28/99 Miloslav Metelka
* 25 Gandalf 1.24 10/23/99 Ian Formanek NO SEMANTIC CHANGE - Sun
* Microsystems Copyright in File Comment
* 24 Gandalf 1.23 9/16/99 Miloslav Metelka
* 23 Gandalf 1.22 9/15/99 Miloslav Metelka
* 22 Gandalf 1.21 9/10/99 Miloslav Metelka
* 21 Gandalf 1.20 8/27/99 Miloslav Metelka
* 20 Gandalf 1.19 8/17/99 Miloslav Metelka
* 19 Gandalf 1.18 7/26/99 Miloslav Metelka
* 18 Gandalf 1.17 7/20/99 Miloslav Metelka
* 17 Gandalf 1.16 7/2/99 Miloslav Metelka
* 16 Gandalf 1.15 6/22/99 Miloslav Metelka
* 15 Gandalf 1.14 6/8/99 Miloslav Metelka
* 14 Gandalf 1.13 6/1/99 Miloslav Metelka
* 13 Gandalf 1.12 5/24/99 Miloslav Metelka
* 12 Gandalf 1.11 5/21/99 Miloslav Metelka endInd removed; fix
* 11 Gandalf 1.10 5/15/99 Miloslav Metelka fixes
* 10 Gandalf 1.9 5/13/99 Miloslav Metelka
* 9 Gandalf 1.8 5/5/99 Miloslav Metelka
* 8 Gandalf 1.7 4/23/99 Miloslav Metelka Undo added and internal
* improvements
* 7 Gandalf 1.6 3/30/99 Miloslav Metelka
* 6 Gandalf 1.5 3/27/99 Miloslav Metelka
* 5 Gandalf 1.4 3/23/99 Miloslav Metelka
* 4 Gandalf 1.3 3/18/99 Miloslav Metelka
* 3 Gandalf 1.2 2/13/99 Miloslav Metelka
* 2 Gandalf 1.1 2/9/99 Miloslav Metelka
* 1 Gandalf 1.0 2/3/99 Miloslav Metelka
* $
*/